In [23]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import re
import pickle

import os
path_dir = os.path.dirname(os.getcwd())

import plotly
plotly.offline.init_notebook_mode()
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_white"
pio.renderers.default='notebook'

%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
In [3]:
cd ../src/
/Users/linafaik/Documents/survival_analysis/src
In [4]:
from train import *
from train_survival_ml import *
from train_survival_deep import *
In [5]:
df = pd.read_csv(os.path.join(path_dir, "outputs/customer_subscription_clean.csv"))
In [6]:
# Parameters

scaler_name = "StandardScaler" #MinMaxScaler
random_state = 123

1. Train / test split¶

In [7]:
# covariate columns (used when possible)

cols_x = [
    'price', 'billing_cycle', 'age',
    'product=prd_1', 'gender=female', 'channel=email', 'reason=support',
    'nb_cases', 'time_since_signup', 
    'date_month_cos', 'date_month_sin',
    'date_weekday_cos', 'date_weekday_sin', 'date_hour_cos',
    'date_hour_sin'
]

col_target = "duration"
In [8]:
Xy_train, Xy_test, y_train, y_test = split_train_test(
    df, cols_x, col_target, test_size=0.15, col_stratify= "censored", random_state=random_state)

Xy_train, Xy_val, y_train, y_val = split_train_test(
    Xy_train, cols_x, col_target, test_size=0.2,  col_stratify= "censored", random_state=random_state)

n_train, n_test, n_val = Xy_train.shape[0], Xy_test.shape[0], Xy_val.shape[0]
n_tot =  n_train + n_test + n_val

print("Train: {}%, Test: {}%, Val: {}%".format(
    round(n_train/n_tot *100),
    round(n_test/n_tot *100),
    round(n_val/n_tot *100)
))
Train: 68%, Test: 15%, Val: 17%
In [9]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# rescale
scaler = eval(scaler_name)()

Xy_train[cols_x] = scaler.fit_transform(Xy_train[cols_x])
Xy_test[cols_x] = scaler.transform(Xy_test[cols_x])

2. Kaplan-Meier estimator¶

In [10]:
from sksurv.nonparametric import kaplan_meier_estimator

time, probas = kaplan_meier_estimator(Xy_train["censored"].astype(bool), Xy_train[col_target])

fig = px.line(x=time, y=probas, width=800, height = 400)
fig.update_layout(dict(xaxis={'title' : 'Time (# days)'}, yaxis={'title' : 'Survival probability'}))
In [11]:
from sksurv.nonparametric import kaplan_meier_estimator

col = "product=prd_1"

for i, v in enumerate(df[col].unique()):
    
    Xy_train_filter = df[df[col] == v]

    time_prd, probas_prd = kaplan_meier_estimator(
        Xy_train_filter["censored"].astype(bool), Xy_train_filter[col_target])
    
    proba_df = pd.DataFrame(
        {'time': time_prd, col : v, 
         'proba': probas_prd}
    )
    
    preds = proba_df if i ==0 else pd.concat([proba_df, preds], axis=0)

preds.head()
Out[11]:
time product=prd_1 proba
0 1.0 0 0.998538
1 2.0 0 0.997063
2 3.0 0 0.995625
3 4.0 0 0.994052
4 5.0 0 0.992715
In [12]:
# product 1 = annual subscription
# product 2 = monthly subscription

map_product = {1: "annual subscription", 0:"monthly subscription"}

data = [
    go.Scatter(
        x=time,y=probas, 
        line_color = 'grey',
        name = "all products")
]

data += [
    go.Scatter(
        x=preds[preds[col] == v].time, 
        y=preds[preds[col] == v].proba, 
        #name = '{} {}'.format(col, v)
        name = map_product[v]
    ) for v in preds[col].unique()
]

data += [
    go.Scatter(
        x=[365*i, 365*i], y=[0, 1], 
        name=f"year {i}", line_color = 'lightgrey') 
    for i in range(1, 5)
]

fig = go.Figure(data)
fig

fig.update_layout(
    dict(
        width=800, height = 400,
        xaxis={'title' : 'nb days'}, 
        yaxis={'title' : 'proba'}
    )
)

3. Cox PH estimator¶

3.1 Model training & analysis¶

Training¶

In [13]:
from sksurv.linear_model import CoxPHSurvivalAnalysis

# train an estimator
estimator = CoxPHSurvivalAnalysis(alpha=0.5)
estimator = estimator.fit(Xy_train[cols_x], y_train)

Cumulative hazard functions¶

In [14]:
# number of customers
N = 5

# predict cumulative hazard function
rnd_idx = np.random.choice(Xy_test.index, N)
rnd_idx = [194386, 279993, 174244, 239372, 185588]
In [15]:
chf_funcs = estimator.predict_cumulative_hazard_function(Xy_test[cols_x].loc[rnd_idx])

data = [
    go.Scatter(
        x=fn.x,y= fn(fn.x), 
        name='customer {}'.format(i+1)) 
    for i, fn in enumerate(chf_funcs)
]

data += [
    go.Scatter(
        x=[365*i, 365*i], y=[0, 1], 
        name=f"year {i}", line_color = 'lightgrey') 
    for i in range(1, 5)
]

fig = go.Figure(data, layout=dict(width=800, height=400))
fig.update_layout({"yaxis":{"range": [0,1]}})

Survival functions¶

In [16]:
# predict survival function
surv_funcs = estimator.predict_survival_function(Xy_test[cols_x].loc[rnd_idx])

# plot results
data = [
    go.Scatter(
        x=fn.x,y= fn(fn.x), 
        name='customer {}'.format(i+1)) 
    for i, fn in enumerate(surv_funcs)
]

data += [
    go.Scatter(
        x=[365*i, 365*i], y=[0, 1], 
        name=f"year {i}", line_color = 'lightgrey') 
    for i in range(1, 5)
]

go.Figure(data, layout=dict(width=800, height=400))

Feature importance¶

In [24]:
feat_importance, fig = plot_feat_imp(cols_x, estimator.coef_)
fig.show()

3.2. Model evaluation¶

C-index¶

In [104]:
from sksurv.metrics import concordance_index_censored

prediction = estimator.predict(Xy_test[cols_x])
result = concordance_index_censored(list(Xy_test.censored.astype(bool)), Xy_test[col_target], prediction)
result
# c-index, concordant,  discordant, tied_risk, tied_time
Out[104]:
(0.6812899789827326, 603082578, 282117916, 24006, 352836)
In [111]:
from sksurv.metrics import concordance_index_ipcw

result = concordance_index_ipcw(y_train, y_test, prediction)
result
# c-index, concordant,  discordant, tied_risk, tied_time
Out[111]:
(0.6639232584980025, 603082578, 282117916, 24006, 352836)

Time-dependant AUC¶

In [ ]:
from sksurv.metrics import cumulative_dynamic_auc
risk_score = estimator.predict(Xy_test[cols_x]) 
In [131]:
#times = np.percentile(df[col_target], np.linspace(5, 81, 15))
times = np.arange(1, 365+1)

auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, risk_score, times)
mean_auc
Out[131]:
0.7854491726177352
In [132]:
times = np.arange(365+1, 365*2+1)
auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, risk_score, times)
mean_auc
Out[132]:
0.7176921175567605
In [136]:
times = np.percentile(df[col_target], np.linspace(5, 81, 15))
auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, risk_score, times)

fig = px.line(x=times, y= auc, width=800, height=400)

fig = fig.add_traces([
    go.Scatter(
        x=[365*i, 365*i], y=[0, 1], 
        name=f"year {i}", line_color = 'lightgrey') 
    for i in range(1, int(max(times)/365)+1)
])


fig.update_layout({
    "xaxis": dict(title = "Time (#days)"),
    "yaxis": dict(title = "Time-dependent AUC")
})

Bier score¶

In [140]:
from sksurv.metrics import brier_score, integrated_brier_score
In [141]:
survs = estimator.predict_survival_function(Xy_test[cols_x])
In [142]:
T = 364/2
preds = [fn(T) for fn in survs]
times, score = brier_score(y_train, y_test, preds, T)
score
Out[142]:
array([0.13207416])
In [146]:
times = np.arange(1, 365)
preds = np.row_stack([ fn(times) for fn in survs])

score = integrated_brier_score(y_train, y_test, preds, times)
print(score)
In [160]:
times = np.arange(1, 365)
survs = estimator.predict_survival_function(Xy_test[cols_x])
get_bier_score(Xy_test, y_train, y_test, survs, times)
Out[160]:
{'estimator': 0.12473187839495915,
 'random': 0.25001400864150725,
 'kaplan_meier': 0.1475128902478881}
In [161]:
times = np.arange(1, 365*3)
survs = estimator.predict_survival_function(Xy_test[cols_x])
get_bier_score(Xy_test, y_train, y_test, survs, times)
Out[161]:
{'estimator': 0.169087465222658,
 'random': 0.2501781867945744,
 'kaplan_meier': 0.19564116760972858}

3.3. Model fine-tuning¶

In [174]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, random_state=random_state, shuffle=True)
In [177]:
from sksurv.linear_model import CoxnetSurvivalAnalysis

grid_params = {
    'l1_ratio': [0.1, 0.5, 0.9, 1],
    'alpha_min_ratio': [0.01],
}

estimator_cox, results = grid_search(
    grid_params, Xy_train, cv, CoxnetSurvivalAnalysis, cols_x,  col_target, verbose = True)
4 total scenario to run
1/4: params: {'l1_ratio': 0.1, 'alpha_min_ratio': 0.01}
Fold 0: 0.682
Fold 1: 0.682
Fold 2: 0.682
Fold 3: 0.681
Fold 4: 0.682
2/4: params: {'l1_ratio': 0.5, 'alpha_min_ratio': 0.01}
Fold 0: 0.681
Fold 1: 0.682
Fold 2: 0.682
Fold 3: 0.681
Fold 4: 0.682
3/4: params: {'l1_ratio': 0.9, 'alpha_min_ratio': 0.01}
Fold 0: 0.681
Fold 1: 0.682
Fold 2: 0.682
Fold 3: 0.681
Fold 4: 0.682
4/4: params: {'l1_ratio': 1, 'alpha_min_ratio': 0.01}
Fold 0: 0.681
Fold 1: 0.682
Fold 2: 0.682
Fold 3: 0.681
Fold 4: 0.682
In [178]:
results
Out[178]:
l1_ratio alpha_min_ratio fold_0 fold_1 fold_2 fold_3 fold_4 time mean std
0 0.1 0.01 0.681810 0.682205 0.681779 0.681432 0.682394 0.959791 0.681924 0.000379
1 0.5 0.01 0.681438 0.682018 0.681717 0.681297 0.682325 1.437839 0.681759 0.000420
2 0.9 0.01 0.681468 0.681959 0.681534 0.681278 0.682319 0.891154 0.681712 0.000421
3 1.0 0.01 0.681468 0.681991 0.681521 0.681277 0.682321 0.943610 0.681716 0.000428
In [179]:
estimator_cox.score(Xy_val[cols_x], y_val)
Out[179]:
0.6676465625254092
In [180]:
with open(os.path.join(path_dir, "outputs/cox_ph.pkl"), "wb") as f:
    pickle.dump(estimator_cox, f)
In [190]:
feat_importance_cox, _ = plot_feat_imp(cols_x, estimator_cox.coef_[:,-1])

4. Gradient Boosting Survival Analysis¶

In [ ]:
from sksurv.ensemble import GradientBoostingSurvivalAnalysis

grid_params = {
    "n_estimators": [20*i for i in range(1,6)],
    "max_depth": [2],
    #"min_samples_leaf": [2, 3],
    "learning_rate": [0.1],
    #"subsample": [0.5],
    "random_state": [random_state],
    "verbose":[1]}

estimator_gb, results = grid_search(
    grid_params, Xy_train, cv, GradientBoostingSurvivalAnalysis, 
    cols_x, col_target, verbose = True)

results
<frozen importlib._bootstrap>:228: RuntimeWarning:

sklearn.tree._criterion.Criterion size changed, may indicate binary incompatibility. Expected 328 from C header, got 528 from PyObject

<frozen importlib._bootstrap>:228: RuntimeWarning:

sklearn.tree._splitter.Splitter size changed, may indicate binary incompatibility. Expected 1160 from C header, got 1360 from PyObject

<frozen importlib._bootstrap>:228: RuntimeWarning:

sklearn.tree._criterion.ClassificationCriterion size changed, may indicate binary incompatibility. Expected 1168 from C header, got 1368 from PyObject

<frozen importlib._bootstrap>:228: RuntimeWarning:

sklearn.tree._criterion.RegressionCriterion size changed, may indicate binary incompatibility. Expected 960 from C header, got 1160 from PyObject

5 total scenario to run
1/5: params: {'n_estimators': 20, 'max_depth': 2, 'learning_rate': 0.1, 'random_state': 123, 'verbose': 1}
      Iter       Train Loss   Remaining Time 
         1     1518066.9232          152.69m
         2     1516536.4925          144.96m
         3     1515189.8359          181.59m
In [ ]:
estimator_gb.score(Xy_val[cols_x], y_val)
In [ ]:
feat_importance_gb, fig = plot_feat_imp(cols_x, estimator_gb.feature_importances_)
fig
In [ ]:
with open(os.path.join(path_dir, "outputs/gradient_boosting.pkl"), "wb") as f:
    pickle.dump(estimator_gb, f)

5. Survival Support Vector Machine¶

In [ ]:
from sksurv.svm import FastSurvivalSVM 
In [ ]:
from sksurv.svm import FastSurvivalSVM 

grid_params = {
    "alpha": [1,2, 5, 10],
    "rank_ratio": [0],
    "max_iter": [1000],
    "tol": [1e-5],
    "random_state": [random_state],
    "verbose":[0]}

estimator_svm, results = grid_search(grid_params, df, cv, FastSurvivalSVM, cols_x, col_target, verbose = True)

results
In [ ]:
with open(os.path.join(path_dir, "outputs/svm.pkl"), "wb") as f:
    pickle.dump(estimator_svm, f)
In [ ]:
from sksurv.svm import FastKernelSurvivalSVM 

grid_params = {
    "kernel": ["linear","poly","rbf","sigmoid","cosine"],
    "alpha": [2],
    "rank_ratio": [0],
    "max_iter": [1000],
    "tol": [1e-5],
    "random_state": [random_state]
}

estimator_ksvm, results = grid_search(grid_params, df, cv, FastKernelSurvivalSVM, cols_x, col_target, verbose = True)

results
In [ ]:
with open(os.path.join(path_dir, "outputs/ksvm.pkl"), "wb") as f:
    pickle.dump(estimator_ksvm, f)